import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder
from sklearn.impute import KNNImputer
import xlrd
import numpy as np
import statistics
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler,StandardScaler,RobustScaler
from bioinfokit.analys import get_data
from sklearn.manifold import TSNE
from bioinfokit.visuz import cluster
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
# model evaluation
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import mean_squared_error
# random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
# gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
# KNN
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from datetime import datetime
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import MeanShift
from sklearn.svm import SVR,SVC
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
############### importing countyData ###############
landarea_data = pd.read_csv("CountyData/5296US_landarea.csv")
pop_data = pd.read_csv("CountyData/5296US_pop.csv")
waterarea_data = pd.read_csv("CountyData/5296US_waterarea.csv")
############### importing EconomicData ###############
dowJonesIA_data = pd.read_csv("EconomicData/BCB-UDJIAD1.csv")
countyMHI_data = pd.read_csv("EconomicData/county1.csv") ## Median Household Income
dowJonesIA_historical_data = pd.read_csv("EconomicData/Dow Jones Industrial Average Historical Data 2.csv")
# Each sheet of the county-GDP workbook is read into its own frame.
# Fix: the original followed each read_excel with a bare `.to_csv()` call;
# with no path argument to_csv() just RETURNS a CSV string, which was
# discarded, so those calls were dead code and have been removed.
currentDollarGDP_data = pd.read_excel("EconomicData/gdpcounty1218.xlsx", "Current Dollar GDP", index_col=None)
realGDP = pd.read_excel("EconomicData/gdpcounty1218.xlsx", "Real GDP", index_col=None)
realGDPGrowth = pd.read_excel("EconomicData/gdpcounty1218.xlsx", "Real GDP Growth", index_col=None)
############### importing GeneralDemographicData ###############
county_complete_data = pd.read_csv("GeneralDemographicData/county_complete.csv")
############### importing LifeExpectancyData ###############
# Fix: dropped the discarded `lifeExpectancy_data.to_csv()` call (no path
# argument -> returns a string that was thrown away).
lifeExpectancy_data = pd.read_excel("LifeExpectancyData/IHME_USA_LIFE_EXPECTANCY_1987_2007_Y2011M06D16.xlsx")
############### importing USElectionResults data ###############
house_data = pd.read_csv("USElectionResults19762020/1976-2020-house.csv")
president_data = pd.read_csv("USElectionResults19762020/1976-2020-president.csv")
senate_data = pd.read_csv("USElectionResults19762020/1976-2020-senate.csv")
# Fix: removed a second, byte-identical read of county_complete.csv that
# redundantly reloaded county_complete_data.
################################# filtering
# Only these four states are analysed in the rest of the file.
wanted_states = ['California','Florida','South Dakota','Wyoming']
Handling Land Area Data
# Preview the land-area table: one row per county (FIPS, State, County, Land Area).
landarea_data.head()
| FIPS | State | County | Land Area | |
|---|---|---|---|---|
| 0 | 1001 | Alabama | Autauga | 1543.7 |
| 1 | 1003 | Alabama | Baldwin | 4135.0 |
| 2 | 1005 | Alabama | Barbour | 2292.1 |
| 3 | 1007 | Alabama | Bibb | 1611.9 |
| 4 | 1009 | Alabama | Blount | 1672.3 |
# No missing values in any column (all counts are 0 below).
landarea_data.isnull().sum()
FIPS 0 State 0 County 0 Land Area 0 dtype: int64
# Keep only the four states of interest (wanted_states defined above);
# 214 counties remain.
landarea_filtered = landarea_data.loc[landarea_data['State'].isin(wanted_states)]
landarea_filtered
| FIPS | State | County | Land Area | |
|---|---|---|---|---|
| 157 | 6001 | California | Alameda | 1910.1 |
| 158 | 6003 | California | Alpine | 1913.1 |
| 159 | 6005 | California | Amador | 1534.7 |
| 160 | 6007 | California | Butte | 4246.6 |
| 161 | 6009 | California | Calaveras | 2642.3 |
| ... | ... | ... | ... | ... |
| 3106 | 56037 | Wyoming | Sweetwater | 27003.0 |
| 3107 | 56039 | Wyoming | Teton | 10380.6 |
| 3108 | 56041 | Wyoming | Uinta | 5391.7 |
| 3109 | 56043 | Wyoming | Washakie | 5802.0 |
| 3110 | 56045 | Wyoming | Weston | 6210.6 |
214 rows × 4 columns
# Split into one DataFrame per state for the per-state outlier handling below.
cali_land = landarea_filtered[landarea_filtered['State'] == 'California']
wy_land = landarea_filtered[landarea_filtered['State'] == 'Wyoming']
fl_land = landarea_filtered[landarea_filtered['State'] == 'Florida']
sd_land = landarea_filtered[landarea_filtered['State'] == 'South Dakota']
Handling outliers
# Box plot of county land area per state, to eyeball outliers before the
# IQR-based removal below.
my_dict = {'California': cali_land['Land Area'], 'Wyoming': wy_land['Land Area'], 'Florida': fl_land['Land Area'], 'South Dakota': sd_land['Land Area']}
fig, ax = plt.subplots()
# Fix: removed a stray bare `ax` expression statement that had no effect.
ax.boxplot(my_dict.values())
ax.set_xticklabels(my_dict.keys())
plt.show()
def iqr_outliers(df):
    """Return the values of *df* (a pandas Series) lying outside Tukey's
    fences, i.e. below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.

    Also prints the outlier list, matching the original behaviour.

    Fix: the original appended into a module-level list ``out`` that was
    never cleared, so every call also reported — and the callers then
    filtered out — the outliers of all *previous* calls (visible in the
    accumulating printed lists in the original output). The accumulator
    is now local to each call.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1
    lower_tail = q1 - 1.5 * iqr
    upper_tail = q3 + 1.5 * iqr
    # Collect only values strictly outside the fences for THIS series.
    out = [i for i in df if i > upper_tail or i < lower_tail]
    print("Outliers:", out)
    return out
# 58 California counties before outlier removal.
cali_land.shape
(58, 4)
# NOTE(review): this call passes the *unique* land-area values, unlike the
# later calls which pass the raw column — confirm the inconsistency is
# intentional.
outliers = iqr_outliers(pd.Series(cali_land['Land Area'].unique()))
cali_land = cali_land[~cali_land['Land Area'].isin(outliers)]
Outliers: [26397.5, 21086.8, 51960.0]
cali_land.shape
(55, 4)
# NOTE(review): the printed outlier lists below grow across calls because
# iqr_outliers appends to a shared module-level list `out` that is never
# cleared; each state's filter therefore also drops rows matching a
# previous state's outliers — confirm this accumulation is not intended.
outliers = iqr_outliers(pd.Series(fl_land['Land Area']))
fl_land = fl_land[~fl_land['Land Area'].isin(outliers)]
Outliers: [26397.5, 21086.8, 51960.0, 5245.9, 5036.2, 5268.9, 4856.1]
outliers = iqr_outliers(pd.Series(wy_land['Land Area']))
wy_land = wy_land[~wy_land['Land Area'].isin(outliers)]
Outliers: [26397.5, 21086.8, 51960.0, 5245.9, 5036.2, 5268.9, 4856.1, 23783.1, 27003.0]
outliers = iqr_outliers(pd.Series(sd_land['Land Area']))
sd_land = sd_land[~sd_land['Land Area'].isin(outliers)]
Outliers: [26397.5, 21086.8, 51960.0, 5245.9, 5036.2, 5268.9, 4856.1, 23783.1, 27003.0, 8989.5, 7442.4]
# Recombine the per-state frames after outlier removal (203 of 214 rows kept).
land_area = pd.concat([cali_land, wy_land,fl_land,sd_land])
land_area.shape
(203, 4)
# Re-draw the per-state box plot on the cleaned data to confirm the
# extreme values are gone.
my_dict = {'California': cali_land['Land Area'], 'Wyoming': wy_land['Land Area'], 'Florida': fl_land['Land Area'], 'South Dakota': sd_land['Land Area']}
fig, ax = plt.subplots()
# Fix: removed a stray bare `ax` expression statement that had no effect.
ax.boxplot(my_dict.values())
ax.set_xticklabels(my_dict.keys())
plt.show()
County_Complete dataset
# Preview the master county demographics table (188 columns).
county_complete_data.head()
| fips | state | name | pop2000 | pop2010 | pop2011 | pop2012 | pop2013 | pop2014 | pop2015 | ... | poverty_under_18_2019 | two_plus_races_2019 | unemployment_rate_2019 | uninsured_2019 | uninsured_65_and_older_2019 | uninsured_under_19_2019 | uninsured_under_6_2019 | veterans_2019 | white_2019 | white_not_hispanic_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1001 | Alabama | Autauga County | 43671.0 | 54571 | 55199.0 | 54927.0 | 54695.0 | 54864.0 | 54838.0 | ... | 23.2 | 2.2 | 3.5 | 7.1 | 0.0 | 1.7 | 1.7 | 12.6 | 76.8 | 74.6 |
| 1 | 1003 | Alabama | Baldwin County | 140415.0 | 182265 | 186534.0 | 190048.0 | 194736.0 | 199064.0 | 202863.0 | ... | 13.4 | 1.7 | 4.0 | 8.9 | 0.3 | 3.8 | 2.2 | 11.8 | 86.2 | 83.1 |
| 2 | 1005 | Alabama | Barbour County | 29038.0 | 27457 | 27351.0 | 27175.0 | 26947.0 | 26749.0 | 26264.0 | ... | 50.1 | 1.2 | 9.4 | 11.3 | 0.3 | 3.3 | 3.4 | 6.6 | 46.8 | 45.8 |
| 3 | 1007 | Alabama | Bibb County | 20826.0 | 22915 | 22745.0 | 22658.0 | 22503.0 | 22533.0 | 22561.0 | ... | NaN | 0.6 | 7.0 | 10.7 | 0.0 | 2.0 | 4.5 | 8.0 | 76.8 | 74.5 |
| 4 | 1009 | Alabama | Blount County | 51024.0 | 57322 | 57562.0 | 57595.0 | 57623.0 | 57546.0 | 57590.0 | ... | 18.4 | 1.6 | 3.1 | 10.8 | 0.2 | 5.9 | 6.1 | 7.7 | 95.5 | 86.9 |
5 rows × 188 columns
# Restrict to the four states of interest (same filter as the land-area data).
county_data_filtered = county_complete_data.loc[county_complete_data['state'].isin(wanted_states)]
county_data_filtered.dtypes
fips int64
state object
name object
pop2000 float64
pop2010 int64
...
uninsured_under_19_2019 float64
uninsured_under_6_2019 float64
veterans_2019 float64
white_2019 float64
white_not_hispanic_2019 float64
Length: 188, dtype: object
# List the non-numeric (object-dtype) columns that still need encoding or
# conversion to numeric.
county_cat = county_data_filtered.dtypes[county_data_filtered.dtypes == 'object']
county_cat
state object name object smoking_ban_2010 object uninsured_age_under_6_2017 object dtype: object
## Convert 'uninsured_age_under_6_2017' from object to float; the column
## uses '-' as its missing-value marker.
# Fix: the original used `.replace('-', None, inplace=True)`. In pandas,
# passing value=None makes Series.replace() fall back to method='pad'
# (forward-fill), so '-' cells were silently filled with the PREVIOUS
# row's value instead of being marked missing. Replacing with np.nan
# keeps them as NaN as intended (the follow-up replace(np.nan, None)
# is then unnecessary and has been dropped).
county_data_filtered['uninsured_age_under_6_2017'] = (
    county_data_filtered['uninsured_age_under_6_2017']
    .replace('-', np.nan)
    .astype(float)
)
# Re-list the remaining object-dtype columns after the conversion.
county_cat = county_data_filtered.dtypes[county_data_filtered.dtypes == 'object']
county_cat
state object name object smoking_ban_2010 object dtype: object
# Count missing values per column; only columns with at least one null
# are shown below (~100 columns still have gaps).
county_nulls = county_data_filtered.isnull().sum()
county_nulls[county_nulls > 0]
age_under_5_2017 1
age_over_65_2017 1
median_age_2017 1
black_2010 4
black_2017 1
...
unemployment_rate_2017 1
mean_work_travel_2019 98
poverty_2019 115
poverty_65_and_over_2019 115
poverty_under_18_2019 115
Length: 100, dtype: int64
Dropping features whose null values make up more than half of the rows
county_data_filtered.shape
(214, 188)
# Drop columns that are more than half null: dropna's `thresh` is the
# minimum number of non-NA values a column must have to be KEPT, so
# thresh = 50% of the row count keeps columns at least half populated.
limitPer = county_data_filtered.shape[0] * .50
county_data_filtered.dropna(thresh=limitPer, axis=1, inplace=True)
# 8 columns removed: 188 -> 180.
county_data_filtered.shape
(214, 180)
## Ordinal-encode smoking_ban_2010 (the original comment said "one hot
## encoding", but OrdinalEncoder maps the ordered categories
## none < partial < comprehensive to 0.0 / 1.0 / 2.0 in a single column).
oe_smoking = OrdinalEncoder(categories=[['none', 'partial','comprehensive']])
county_data_filtered['smoking_ban_2010'] = oe_smoking.fit_transform(county_data_filtered[['smoking_ban_2010']])
# Mark the encoded column as categorical rather than numeric.
county_data_filtered['smoking_ban_2010'] = county_data_filtered['smoking_ban_2010'].astype('category')
# Re-check remaining nulls after the dtype fixes above.
county_nulls = county_data_filtered.isnull().sum()
county_nulls[county_nulls > 0]
age_under_5_2017 1
age_over_65_2017 1
median_age_2017 1
black_2010 4
black_2017 1
..
civilian_labor_force_2017 1
employed_2017 1
unemployed_2017 1
unemployment_rate_2017 1
mean_work_travel_2019 98
Length: 92, dtype: int64
We will drop features that are highly correlated
# Correlation heatmap over the numeric feature columns only (skip the
# first three identifier columns: fips, state, name) to spot highly
# correlated features for removal.
county_complete_numericals = county_data_filtered.iloc[:,3:]
county_complete_numericals = county_complete_numericals.select_dtypes(include='number')
# NOTE(review): figsize=(250, 250) inches is enormous even for a
# ~180x180 annotated heatmap — confirm this renders/saves acceptably.
plt.figure(figsize=(250, 250))
sns.heatmap(county_complete_numericals.corr(), annot=True, cmap='mako_r')
plt.show()